CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
C     reordered.f
C
C     This program uses a four-point stencil to smooth a 1,000 by 1,000
C     array.  The smoothing is done on a single processor.  This version
C     attempts to provide a more efficient serial version than naive.f.
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
      PROGRAM MAIN
C     Driver: fills a square array through init_stencil, smooths it
C     through compute_stencil, and reports each phase on standard
C     output.
      IMPLICIT NONE
C     side - number of rows and of columns in the square array
C     tol  - convergence tolerance handed to compute_stencil
      INTEGER, PARAMETER :: side = 1000
      REAL, PARAMETER :: tol = 0.1
      REAL grid(0:side-1, 0:side-1)

      PRINT *, "initializing the array."
      CALL init_stencil(grid, side, side, side)

      PRINT *, "computing the stencil."
      CALL compute_stencil(grid, side, side, tol)

      PRINT *, "ending the program."

      END PROGRAM MAIN


CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
C     init_stencil
C
C     This routine reads in the initial values for the array over which
C     smoothing will occur.  The whole array is read from the
C     direct-access file "stencil.dat", one array column per record.
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
      SUBROUTINE init_stencil(stencil, prob_size, m, n)
C     Fill stencil(0:m-1, 0:n-1) from the direct-access unformatted
C     file "stencil.dat": record j+1 supplies column j.
C
C     Arguments:
C       stencil   - (out) array to fill, bounds (0:m-1, 0:n-1)
C       prob_size - (in)  number of REAL values in one file record
C       m, n      - (in)  extents of stencil; m values are read
C                         from each of the first n records
C
C     Any OPEN or READ failure is reported and the program stops.
      IMPLICIT NONE
      INTEGER prob_size, m, n
      REAL stencil(0:m-1, 0:n-1)
      INTENT(IN) prob_size, m, n
      INTENT(OUT) stencil
      INTEGER i, j, ios, real_len
      REAL probe

C     The unit in which RECL is measured is processor-dependent, so
C     ask the processor for the length of one default REAL instead
C     of hard-coding 4.
      probe = 0.0
      INQUIRE(IOLENGTH=real_len) probe

      OPEN(UNIT=1, FILE="stencil.dat", STATUS="OLD", ACTION="READ",
     1   ACCESS="DIRECT", FORM="UNFORMATTED",
     2   RECL=real_len*prob_size, IOSTAT=ios)
      IF (ios .NE. 0) THEN
         PRINT *, "init_stencil: cannot open stencil.dat, iostat =",
     1      ios
         STOP 1
      END IF

C     One record per column keeps the first index in the inner
C     (implied) loop.
      DO j=0, n-1
         READ (1, REC=j+1, IOSTAT=ios) (stencil(i,j), i=0, m-1)
         IF (ios .NE. 0) THEN
            PRINT *, "init_stencil: read failed at record ", j+1,
     1         ", iostat =", ios
            STOP 1
         END IF
      END DO
      CLOSE(UNIT=1)

      END


CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
C     compute_stencil
C
C     This routine smooths the values of the array.  The routine sweeps
C     over consecutive columns of the array until it reaches the end, then
C     it checks the maximum error for that sweep, local_err, to see whether
C     it is below tolerance.
C
C     Notice that Fortran stores arrays in column-major order, so walking
C     down each column (first index in the inner loop) instead of along
C     each row gives much more efficient execution, mainly because it
C     makes better use of the data cache.
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
      SUBROUTINE compute_stencil(stencil, m, n, close_enough)
C     Smooth the interior of stencil in place by repeated sweeps.
C     In each sweep every interior point (1..m-2, 1..n-2) is
C     replaced by the mean of its four neighbours; because the
C     array is updated in place, neighbours already visited in the
C     current sweep contribute their new values.  Sweeps repeat
C     until the largest change made to any point in a sweep is no
C     greater than close_enough.
C
C     Arguments:
C       stencil      - (inout) array, bounds (0:m-1, 0:n-1); the
C                      outermost rows and columns are never written
C       m, n         - (in) extents of stencil
C       close_enough - (in) convergence tolerance on the largest
C                      per-point change in one sweep
C
C     Every 100th sweep prints the sweep count and its largest
C     change; the final sweep count is printed on exit.
C     NOTE: there is no cap on the number of sweeps; a negative
C     close_enough can never be satisfied.
      IMPLICIT NONE
C     m and n are declared before stencil because they appear in
C     its bounds.
      INTEGER m, n
      REAL stencil(0:m-1, 0:n-1)
      REAL close_enough
      INTENT(IN) m, n, close_enough
      INTENT(INOUT) stencil
      REAL local_err, old_value
      INTEGER i, j, iter_count

      iter_count = 0
      DO
         local_err = 0.0
         iter_count = iter_count + 1

C        The inner loop runs over the first index so that memory
C        is walked contiguously (column-major storage).
         DO j=1, n-2
            DO i=1, m-2
               old_value = stencil(i,j)

               stencil(i,j) = ( stencil(i-1, j ) +
     1                          stencil(i+1, j ) +
     2                          stencil( i ,j-1) +
     3                          stencil( i ,j+1) ) / 4
               local_err = MAX(local_err,ABS(old_value-stencil(i,j)))
            END DO
         END DO
         IF (MOD(iter_count,100).EQ.0) PRINT *, iter_count, local_err

C        Same test as the original backward jump, negated, so the
C        loop ends exactly when the jump would not have been taken.
         IF (.NOT. (close_enough.LT.local_err)) EXIT
      END DO
      PRINT *, "convergence reached after ", iter_count, " iterations."

      END
